Integration with public data

In this script I integrate the mouse neuron data with publically available datasets and compare PCA and WGCNA for gene module discovery.

Load the required R libraries:

library(readxl)
library(dplyr)
library(Seurat)
library(myUtils)
library(EnsDb.Mmusculus.v75)
library(lubridate)
library(ComplexHeatmap)
library(R.matlab)
library(WGCNA)
library(biomaRt)
library(org.Mm.eg.db)
library(gridExtra)
library(grid)

Here I 1.) load the data 2.) change the gene identifiers from ensemble ids to symbols, where this is possible (otherwise I leave the ensembl id in place) 3.) load the metadata 4.) I put the data and metadata into a Seurat object. 5.) I also load mitochondrial mouse genes from a database for later use.

directory = '/home/jovyan/axonGrowth/'
setwd(directory)
axonGrowth_counts = read.csv('counts.csv', row.names = 1)
rownames(axonGrowth_counts) = unlist(lapply(1:length(rownames(axonGrowth_counts)), function(x) strsplit(rownames(axonGrowth_counts)[x], split = '\\.')[[1]][1]))
symbols = mapIdsMouse(rownames(axonGrowth_counts), 'ENSEMBL', 'SYMBOL')
mitogenes <- genes(EnsDb.Mmusculus.v75, filter = ~ seq_name == "MT")$gene_id
percent.mt = colSums(axonGrowth_counts[rownames(axonGrowth_counts) %in% mitogenes,])/colSums(axonGrowth_counts)
rownames(axonGrowth_counts) = symbols
#sum(unlist(lapply(1:dim(axonGrowth_counts)[1], function(x) substring(rownames(axonGrowth_counts)[x], 1,3))) != 'ENS')/dim(axonGrowth_counts)[1]
metadata = read_xlsx('Summary-neurons_microcoverslips_RNAseq.xlsx', col_types = c('text', 'text', 'date', 'date', 'date', 'date', 'date', rep('text', 7)), skip = 1)
metadata = metadata[10:105,]
axonGrowth = CreateSeuratObject(axonGrowth_counts, project = 'axonGrowth', min.cells = 0, min.features = 0)
axonGrowth$TotalNeuriteLength = as.numeric(metadata$`Total Neurite Length (um)`)
axonGrowth$LongestNeuriteLength = as.numeric(metadata$`Longest Neurite Length (um)`)
axonGrowth$TimeSincePlating = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`Time of plating (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))
axonGrowth$TimeSincePlating = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`Time of plating (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))
axonGrowth$TimeInAIRMEM = unlist(lapply(1:dim(metadata)[1], function(x) interval(ymd_hms(metadata$`AIR-MEM change (mm/dd/yyyy hh:mm)`[x]), ymd_hms(metadata$`Cell collection time (mm/dd/yyyy hh:mm)`[x]))/dhours(1)))

The QC plots using number of detected genes, number of counts and percent of counts coming from mitochondrial genes (as a proxy for stress), show a couple of outlier cells, which I remove in the last line:

axonGrowth[["percent.mt"]] <- percent.mt
VlnPlot(axonGrowth, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(axonGrowth, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(axonGrowth, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

axonGrowth <- subset(axonGrowth, subset = nFeature_RNA > 10000 & nFeature_RNA < 20000 & percent.mt < 2.5 & nCount_RNA > 10^5 & nCount_RNA < 10^6)
VlnPlot(axonGrowth, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

The following normalizes, scales and select 1000 particularly variable genes. Doing the downstream analysis with only the 1000 most variable genes seems like a waste of information. However, given we only have 85 high quality cells available for this analysis at the moment, it will give us the most robust clustering results.

axonGrowth <- NormalizeData(axonGrowth, normalization.method = "LogNormalize", scale.factor = 10000)
axonGrowth <- FindVariableFeatures(axonGrowth, selection.method = "vst", nfeatures = 1000)
top25 <- head(VariableFeatures(axonGrowth), 25)
plot1 <- VariableFeaturePlot(axonGrowth)
plot2 <- LabelPoints(plot = plot1, points = top25, repel = TRUE)
CombinePlots(plots = list(plot1, plot2))

all.genes <- rownames(axonGrowth)
axonGrowth <- ScaleData(axonGrowth, features = all.genes)

These are the results of the PCA analysis:

axonGrowth <- RunPCA(axonGrowth, features = VariableFeatures(object = axonGrowth))
print(axonGrowth[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  ENSMUSG00000105285, ENSMUSG00000111824, ENSMUSG00000092805, Tmem170, ENSMUSG00000106451 
## Negative:  Ppia, Rtn3, Hnrnpk, Calm1, ENSMUSG00000044285 
## PC_ 2 
## Positive:  Cyba, Ctss, Lyz2, Tyrobp, Hexb 
## Negative:  Stmn4, Mapt, Nrep, Map1b, Stmn2 
## PC_ 3 
## Positive:  Cldn3, Rpl17, Smurf2, Prxl2a, Zfp334 
## Negative:  Syt3, Lao1, Tspan12, Ado, Pcdhb16 
## PC_ 4 
## Positive:  Zscan18, Rtf2, Kctd2, Ddit3, Rps6kc1 
## Negative:  Zdhhc8, Acap3, Stard7, Pgm1, Aatk 
## PC_ 5 
## Positive:  ENSMUSG00000096972, Slc19a1, Ndufv1, Atg101, Irf3 
## Negative:  Gadd45g, Slc38a2, Plppr1, Gli3, Cdpf1
VizDimLoadings(axonGrowth, dims = 1:2, reduction = "pca")

DimPlot(axonGrowth, reduction = "pca")

DimHeatmap(axonGrowth, dims = 1, cells = round(dim(axonGrowth)[2])/2, balanced = TRUE)

DimHeatmap(axonGrowth, dims = 1:15, cells = round(dim(axonGrowth)[2])/2, balanced = TRUE)

Based on the JackStraw procedure I select 12 PCs for further analysis:

axonGrowth <- JackStraw(axonGrowth, num.replicate = 100)
axonGrowth <- ScoreJackStraw(axonGrowth, dims = 1:20)
JackStrawPlot(axonGrowth, dims = 1:20)

ElbowPlot(axonGrowth, ndims = 40)

n_components = 12

Let’s load the data from https://science.sciencemag.org/content/360/6385/176 as a reference:

reference = readMat('/home/jovyan/data/developingMouse/GSM3017261_150000_CNS_nuclei.mat')

We focus on hippocampal neurons from the mouse brain:

region = unlist(lapply(reference$cluster.assignment, function(x) strsplit(x, split = ' ')[[1]][2]))

subset = which(region == 'HIPP')

reference$DGE = reference$DGE[subset,]
reference$barcodes = reference$barcodes[subset]
reference$cluster.assignment = reference$cluster.assignment[subset]
reference$sample.type = reference$sample.type[subset]
reference$genes = trimws(reference$genes)
subset_genes = which(reference$genes %in% rownames(axonGrowth))
reference$genes = reference$genes[subset_genes]
reference$DGE = reference$DGE[,subset_genes]

reference_matrix = as.matrix(reference$DGE)
reference_matrix = t(reference_matrix)
rownames(reference_matrix) = reference$genes
colnames(reference_matrix) = reference$barcodes

reference$DGE = NULL

Reference = CreateSeuratObject(reference_matrix)

Reference <- NormalizeData(Reference, verbose = FALSE)
Reference <- FindVariableFeatures(Reference, selection.method = "vst", nfeatures = 2000, verbose = FALSE)

Now we integrate the two datasets using the method provided by Seurat (see https://www.sciencedirect.com/science/article/pii/S0092867419305598?via%3Dihub):

axonGrowth.anchors <- FindIntegrationAnchors(object.list = list(Reference, axonGrowth), dims = 1:30, verbose = TRUE, k.filter = 77)
axonGrowth.integrated <- IntegrateData(anchorset = axonGrowth.anchors, dims = 1:30)

Let’s vizualize differentiation trajectories with the markers suggested in the reference paper and see how our data relates to the reference data on a UMAP plot:

# Run the standard workflow for visualization and clustering
axonGrowth.integrated <- ScaleData(axonGrowth.integrated, verbose = FALSE)
axonGrowth.integrated <- RunPCA(axonGrowth.integrated, npcs = 30, verbose = FALSE)
axonGrowth.integrated <- RunUMAP(axonGrowth.integrated, reduction = "pca", dims = 1:5)
axonGrowth.integrated$orig.ident[axonGrowth.integrated$orig.ident == 'SeuratProject'] = 'Reference'
axonGrowth.integrated$orig.ident[axonGrowth.integrated$orig.ident == 'axonGrowth'] = 'Our data'

DimPlot(axonGrowth.integrated, reduction = "umap", group.by = "orig.ident")

FeaturePlot(axonGrowth.integrated, features = c('Mki67', 'Prox1', 'Dock10', 'Meis2', 'Spock1'))

FeaturePlot(axonGrowth.integrated, features = c('Mki67', 'Prox1', 'Dock10', 'Meis2', 'Spock1'), cells = colnames(axonGrowth.integrated)[axonGrowth.integrated$orig.ident == 'Our data'])

Now we find correlated gene modules with WGCNA, see https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html and see to what extent their eigengene correlates with total neurite length and longest neurite length:

options(stringsAsFactors = FALSE);
enableWGCNAThreads()
## Allowing parallel execution with up to 23 working processes.
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(t(axonGrowth.integrated@assays$integrated@scale.data), powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 2000.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 2000 of 2000
##    Power SFT.R.sq slope truncated.R.sq  mean.k. median.k.   max.k.
## 1      1    0.878 -3.40          0.968 3.49e+01  3.03e+01 1.27e+02
## 2      2    0.937 -2.42          0.969 1.92e+00  1.06e+00 2.32e+01
## 3      3    0.896 -1.96          0.905 2.61e-01  6.64e-02 6.83e+00
## 4      4    0.382 -2.41          0.349 6.02e-02  5.72e-03 2.50e+00
## 5      5    0.339 -2.05          0.223 1.89e-02  6.17e-04 1.04e+00
## 6      6    0.319 -1.86          0.140 7.24e-03  7.39e-05 5.32e-01
## 7      7    0.346 -2.70          0.247 3.20e-03  9.76e-06 3.47e-01
## 8      8    0.353 -2.54          0.265 1.58e-03  1.37e-06 2.33e-01
## 9      9    0.357 -2.35          0.275 8.43e-04  1.95e-07 1.59e-01
## 10    10    0.364 -2.28          0.283 4.80e-04  2.76e-08 1.10e-01
## 11    12    0.359 -2.11          0.281 1.78e-04  6.31e-10 5.39e-02
## 12    14    0.335 -2.15          0.149 7.38e-05  1.41e-11 2.70e-02
## 13    16    0.325 -1.96          0.142 3.28e-05  3.40e-13 1.37e-02
## 14    18    0.330 -1.91          0.147 1.52e-05  8.77e-15 6.98e-03
## 15    20    0.335 -1.97          0.149 7.24e-06  2.22e-16 3.59e-03
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
softPower = 5;
adjacency = adjacency(t(axonGrowth.integrated@assays$integrated@scale.data), power = softPower)
TOM = TOMsimilarity(adjacency);
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
##  ..cutHeight not given, setting it to 1  ===>  99% of the (truncated) height range in dendro.
##  ..done.
table(dynamicMods)
## dynamicMods
##    0    1    2    3 
## 1596  265   80   59
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
##      blue     brown      grey turquoise 
##        80        59      1596       265
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")

datTraits = data.frame(TotalNeuriteLength = axonGrowth.integrated$TotalNeuriteLength,
                       LongestNeuriteLength = axonGrowth.integrated$LongestNeuriteLength)

MEs0 = moduleEigengenes(t(axonGrowth.integrated@assays$integrated@scale.data), dynamicColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
nSamples = dim(axonGrowth.integrated@assays$integrated@scale.data)[2]
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))

Finally let’s see how the gene modules relate to the UMAP plot:

axonGrowth.integrated$MEturquoise = MEs$MEturquoise
axonGrowth.integrated$MEblue = MEs$MEblue
axonGrowth.integrated$MEbrown = MEs$MEbrown
axonGrowth.integrated$MEgrey = MEs$MEgrey

FeaturePlot(axonGrowth.integrated, features = c('MEturquoise', 'MEblue', 'MEbrown', 'MEgrey'))

And what go functions they are enriched for:

library(topGO)

for (module in c('turquoise', 'blue', 'brown', 'grey')){
  print(module)
  all_genes = rep(0,dim(axonGrowth.integrated@assays$integrated@scale.data)[1])
names(all_genes) = rownames(axonGrowth.integrated@assays$integrated@scale.data)
all_genes[dynamicColors == module] = 1
GOdata <- new("topGOdata", ontology = "BP", allGenes = all_genes, geneSel = function(p) p < 
    0.05, description = "Test", annot = annFUN.org, mapping = "org.Mm.eg.db", 
    ID = "symbol")

resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
tab = GenTable(GOdata, classicFisher = resultFisher, topNodes = 10)
d = as.data.frame(as.matrix(tab[,c(1,2,6)]))
p = grid.table(d)
print(p)
print(d)
}

## [1] "turquoise"
## NULL
##         GO.ID                                Term classicFisher
## 1  GO:0007049                          cell cycle       5.3e-05
## 2  GO:0009056                   catabolic process       0.00019
## 3  GO:0010564    regulation of cell cycle process       0.00026
## 4  GO:0022402                  cell cycle process       0.00043
## 5  GO:0002376               immune system process       0.00046
## 6  GO:0008152                   metabolic process       0.00051
## 7  GO:0071704 organic substance metabolic process       0.00055
## 8  GO:0044238           primary metabolic process       0.00058
## 9  GO:0044237          cellular metabolic process       0.00066
## 10 GO:0006807 nitrogen compound metabolic process       0.00068
## [1] "blue"
## NULL
##         GO.ID                                        Term classicFisher
## 1  GO:0007017                   microtubule-based process         0.018
## 2  GO:1901615 organic hydroxy compound metabolic proce...         0.037
## 3  GO:0000226       microtubule cytoskeleton organization         0.052
## 4  GO:0016358                        dendrite development         0.052
## 5  GO:0001701              in utero embryonic development         0.057
## 6  GO:0032990                     cell part morphogenesis         0.060
## 7  GO:0051301                               cell division         0.060
## 8  GO:0048858               cell projection morphogenesis         0.061
## 9  GO:0120039 plasma membrane bounded cell projection ...         0.061
## 10 GO:0030182                      neuron differentiation         0.062
## [1] "brown"
## NULL
##         GO.ID                                        Term classicFisher
## 1  GO:0009059          macromolecule biosynthetic process        0.0021
## 2  GO:1901360 organic cyclic compound metabolic proces...        0.0028
## 3  GO:0032989            cellular component morphogenesis        0.0033
## 4  GO:0034645 cellular macromolecule biosynthetic proc...        0.0034
## 5  GO:0000902                          cell morphogenesis        0.0050
## 6  GO:0090304              nucleic acid metabolic process        0.0054
## 7  GO:0006725 cellular aromatic compound metabolic pro...        0.0055
## 8  GO:0016070                       RNA metabolic process        0.0057
## 9  GO:0046483               heterocycle metabolic process        0.0065
## 10 GO:0099536                          synaptic signaling        0.0065
## [1] "grey"
## NULL
##         GO.ID                                        Term classicFisher
## 1  GO:0009605               response to external stimulus       1.1e-10
## 2  GO:0002274                myeloid leukocyte activation       7.3e-08
## 3  GO:0030593                       neutrophil chemotaxis       7.5e-08
## 4  GO:0071621                      granulocyte chemotaxis       2.3e-07
## 5  GO:0098742 cell-cell adhesion via plasma-membrane a...       2.3e-07
## 6  GO:0032879                  regulation of localization       4.5e-07
## 7  GO:0007155                               cell adhesion       5.2e-07
## 8  GO:0051049                     regulation of transport       6.0e-07
## 9  GO:0050808                        synapse organization       6.0e-07
## 10 GO:0048002 antigen processing and presentation of p...       6.2e-07

The ‘blue module’ is moderately correlated with Neurite Length and enriched for known GO processes involved in neurite growth. The genes contained in this module are printed out below:

print(names(all_genes)[dynamicColors == 'blue'])
##  [1] "Csf1r"     "Spp1"      "Anxa3"     "Trem2"     "Apobec1"   "Plin2"    
##  [7] "Hvcn1"     "Itgam"     "Gpx1"      "Sh3bgrl3"  "Lgals3"    "Fth1"     
## [13] "Lyz2"      "Ftl1"      "Ctsz"      "Ctss"      "Cd68"      "Gpnmb"    
## [19] "Capg"      "Laptm5"    "B2m"       "Tyrobp"    "Mmp12"     "Fcer1g"   
## [25] "H2-D1"     "Arpc1b"    "Cybb"      "Anxa1"     "Mfge8"     "Mpeg1"    
## [31] "Fcgr3"     "Cyba"      "C3ar1"     "Trf"       "H2-K1"     "Anxa2"    
## [37] "Ccl9"      "Cd53"      "Sat1"      "C5ar1"     "Pfn1"      "Itgb2"    
## [43] "Gstm1"     "Cd36"      "Atp6v0d2"  "Unc93b1"   "Cd52"      "Tnfrsf12a"
## [49] "Cav2"      "Plaur"     "Aldoc"     "Cd93"      "Ninj1"     "Cdkn1a"   
## [55] "Pirb"      "S100a1"    "Rac2"      "Cd72"      "Fbxo43"    "Hoxa3"    
## [61] "Tmem217"   "Tspo"      "Ankrd37"   "Swsap1"    "Tmem119"   "Bank1"    
## [67] "Naip6"     "Ado"       "Oas1a"     "Mgst1"     "Nfe2l2"    "Dnase1l1" 
## [73] "Snrnp25"   "Mvp"       "Gsn"       "Kcnn4"     "Uap1l1"    "Tlr2"     
## [79] "Tlr13"     "Cldn3"

However, the below plot that shows Longest Neurite Length as a function of this modules eigengene (PC1), shows no clear association between the two:

plot(axonGrowth.integrated$MEblue, axonGrowth.integrated$TotalNeuriteLength, pch = '.', cex = 10, xlab = 'Blue Module PCA1-score', ylab = 'LongestNeuriteLength')